In [3]:
import sys,os,gzip
from collections import defaultdict
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
%load_ext sql
In [4]:
# Make the parent of the current working directory importable.
sys.path.append(os.path.dirname(os.getcwd()))
In [5]:
import splitter
Our test set here includes the 1.3 million molecules from ChEMBL20 with MW < 600 that could be successfully processed by the RDKit.
We use the Standard InChI that comes with ChEMBL and a non-standard InChI (options "/FixedH /SUU") that allows tautomers to be distinguished. Here's the sequence of psql commands used to generate that set:
-- Map each molregno to its ChEMBL id, using only the entity_type 'COMPOUND' rows.
create temporary view molregno_lookup as select entity_id molregno,chembl_id from chembl_id_lookup where entity_type = 'COMPOUND';
-- Keep only the structures whose mw_freebase is below 600.
select * into temporary table small_compounds from compound_structures join compound_properties using (molregno) where mw_freebase<600;
-- psql output settings: \f sets the field separator, \a toggles between aligned and
-- unaligned output, and \o sends subsequent query output to chembl_export.txt.
\f ' '
\a
\o chembl_export.txt
-- Export the ChEMBL id, the standard InChI and key, the '/FixedH /SUU' InChI and key, and the SMILES.
select chembl_id,standard_inchi,standard_inchi_key,mol_inchi(m,'/FixedH /SUU') nonstandard_inchi, mol_inchikey(m,'/FixedH /SUU'), canonical_smiles from small_compounds join rdk.mols using (molregno) join molregno_lookup using(molregno) ;
In [7]:
# Count the rows of chembl_export_nonstandard. This call spells out the
# connection string; the %sql calls further down omit it.
%sql postgresql://localhost/inchi_split \
select count(*) from chembl_export_nonstandard;
Out[7]:
In [9]:
# Ten most frequent values of the formula column, each with the number of
# chemblid entries (freq) that share it.
d = %sql \
select formula,count(chemblid) freq from chembl_export_nonstandard group by formula \
order by freq desc limit 10;
d
Out[9]:
In [11]:
# Group on (formula, skeleton, hydrogens) and keep the ten largest groups.
# Every result row ends with the group's freq count.
d = %sql \
select formula,skeleton,hydrogens,count(chemblid) freq from chembl_export_nonstandard group by \
(formula,skeleton,hydrogens) \
order by freq desc limit 10;
Let's look at a few of the most common main-layer groups:
In [12]:
# Show the first five of the result rows held in d.
d[:5]
Out[12]:
In [19]:
tpl=d[0][:-1]
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[19]:
In [21]:
tpl=d[1][:-1]
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[21]:
In [20]:
tpl=d[3][:-1]
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[20]:
In [26]:
# Group on (formula, skeleton, hydrogens, charge, protonation), keep the ten
# largest groups, and show the first five.
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,count(chemblid) freq from chembl_export_nonstandard group by \
(formula,skeleton,hydrogens,charge,protonation) \
order by freq desc limit 10;
d[:5]
Out[26]:
We already saw those groups when grouping on the main layer alone.
In [27]:
# Extend the grouping key with the four stereo columns (stereo_bond, stereo_tet,
# stereo_m, stereo_s), keep the ten largest groups, and show the first five.
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,stereo_bond,stereo_tet,stereo_m,stereo_s,count(chemblid) freq from chembl_export_nonstandard group by \
(formula,skeleton,hydrogens,charge,protonation,stereo_bond,stereo_tet,stereo_m,stereo_s) \
order by freq desc limit 10;
d[:5]
Out[27]:
In [33]:
tpl=d[0][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,''),coalesce(stereo_bond,''),\
coalesce(stereo_tet,''),coalesce(stereo_m,''),coalesce(stereo_s,'')) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[33]:
In [37]:
tpl=d[3][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,''),coalesce(stereo_bond,''),\
coalesce(stereo_tet,''),coalesce(stereo_m,''),coalesce(stereo_s,'')) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[37]:
In [45]:
# Group on (formula, skeleton, hydrogens, charge, protonation, isotope), keep the
# ten largest groups, and show the first five.
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,isotope,count(chemblid) freq \
from chembl_export_nonstandard \
group by \
(formula,skeleton,hydrogens,charge,protonation,isotope) \
order by freq desc limit 10;
d[:5]
Out[45]:
Those we've seen before at the skeleton grouping level. Let's see some that actually include isotope info:
In [58]:
d = %sql \
select formula,skeleton,hydrogens,charge,protonation,isotope,count(chemblid) freq \
from chembl_export_nonstandard where isotope is not null\
group by \
(formula,skeleton,hydrogens,charge,protonation,isotope) \
order by freq desc limit 10;
d[:5]
Out[58]:
In [47]:
tpl=d[0][:-1]
tpl = tuple(x if x is not None else '' for x in tpl)
print(tpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,''),isotope) = :tpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[47]:
This is a good one to spend a bit of time with. Let's look at the other members of that family when we ignore isotopes (and just look at the main layer, charge, and protonation):
In [48]:
ttpl = tpl[:-1]
print(ttpl)
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
(formula,skeleton,hydrogens,\
coalesce(charge,''),coalesce(protonation,'')) = :ttpl
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
Draw.MolsToGridImage(ms,legends=cids)
Out[48]:
In [59]:
# Compounds whose isotope_stereo_tet value is set and differs from stereo_tet.
# NOTE(review): in SQL, a NULL stereo_tet makes the != comparison NULL, so such
# rows are filtered out here — confirm that this is intended.
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
isotope_stereo_tet is not null and stereo_tet!=isotope_stereo_tet
# Split the result into ids and parsed molecules; later cells draw from ms and cids.
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
# Number of matching rows.
len(rows)
Out[59]:
In [53]:
# Draw the first six molecules. The legends are sliced to the same length as the
# molecule list so that the two sequences passed to the drawing call always match.
Draw.MolsToGridImage(ms[:6],legends=cids[:6])
Out[53]:
Most of those have the labelled atom involved in an unknown stereocenter (the second is the exception). Let's see if we can find more cases like that exception, i.e. ones where the isotope stereo layer contains no unknown ('?') centers:
In [60]:
# Compounds whose isotope_stereo_tet value is set, contains no '?' character,
# and differs from stereo_tet.
rows = %sql \
select chemblid,smiles from chembl_export join chembl_export_nonstandard using (chemblid) where \
isotope_stereo_tet is not null and position('?' in isotope_stereo_tet)<=0 and stereo_tet!=isotope_stereo_tet
# Split the result into ids and parsed molecules; the next cell draws from ms and cids.
cids = [x for x,y in rows]
ms = [Chem.MolFromSmiles(y) for x,y in rows]
# Number of matching rows.
len(rows)
Out[60]:
In [61]:
# Draw every molecule in ms on a grid of 300x300 cells, labelled with the ids in cids.
Draw.MolsToGridImage(
    ms,
    subImgSize=(300, 300),
    legends=cids,
)
Out[61]:
In [49]:
# NOTE(review): neither smis nor tItems is assigned in any cell visible in this
# notebook, so this lookup presumably relies on state left over from an earlier
# session — confirm, then either define them above or drop this cell.
smis[tItems[-2][1]]
Out[49]:
In [ ]: